| Sunrise logo | ![]() |
![]() |
|---|---|---|
These first two cells give us access to some external Python code we will need. Hover over the "[ ]" on the top left corner of the cell below and you should see a "play" button appear. Click on it to run the cell then move to the next one.
!git clone https://github.com/edgi-govdata-archiving/ECHO_modules.git
!git clone https://github.com/edgi-govdata-archiving/ECHO-Geo.git
!git clone -b first-draft --single-branch https://github.com/edgi-govdata-archiving/ECHO-Sunrise.git # This has the utilities file for mapping
# Import code libraries
%run ECHO_modules/DataSet.py
%run ECHO-Sunrise/utilities.py
import urllib.parse
import pandas as pd
!pip install geopandas
import geopandas
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import requests
import csv
import datetime
import ipywidgets as widgets
This may take some time to load - there are thousands of facilities!
echo_data_sql = "select * from ECHO_EXPORTER where FAC_STATE = 'MA' and FAC_ACTIVE_FLAG='Y' and GHG_FLAG='Y'" # 24000 facilities in total, but can we load them all?
try:
print(echo_data_sql)
echo_data = get_data( echo_data_sql, 'REGISTRY_ID' )
num_facilities = echo_data.shape[0]
print("\nThere are %s EPA facilities in Massachussets tracked in the ECHO database." %(num_facilities))
#mapper_marker(echo_data) # Not showing up...
except pd.errors.EmptyDataError:
print("\nThere are no EPA facilities in this region.\n")
mapper_marker(echo_data)
Here's where you can learn more about the different programs...
%run ECHO_modules/make_data_sets.py
# Only list the data set if it has the correct flag set.
data_set_choices = []
for k, v in data_sets.items():
if ( v.has_echo_flag( echo_data ) ):
data_set_choices.append( k )
data_set_widget=widgets.Dropdown(
options=list(data_set_choices),
description='Data sets:',
disabled=False,
value='Greenhouse Gases'
)
display(data_set_widget)
region_field = {
'Congressional District': { "field": 'FAC_DERIVED_CD113' },
'County': { "field": 'FAC_COUNTY' },
'Zip Code': { "field": 'FAC_DERIVED_ZIP' }
}
style = {'description_width': 'initial'}
select_region_widget = widgets.Dropdown(
options=region_field.keys(),
style=style,
value='County',
description='Region of interest:',
disabled=False
)
display( select_region_widget )
program = data_sets[ data_set_widget.value ]
program_data = None
key=dict() # Create a way to look up Registry IDs in ECHO_EXPORTER later
# We need to provide a custom list of program ids for some programs.
if ( program.name == "Air Inspections" or program.name == "Air Enforcements" ):
# The REGISTRY_ID field is the index of the echo_data
registry_ids = echo_data[echo_data['AIR_FLAG'] == 'Y'].index.values
key = { i : i for i in registry_ids }
program_data = program.get_data( ee_ids=registry_ids )
elif ( program.name == "Combined Air Emissions" ):
ghg_registry_ids = echo_data[echo_data['GHG_FLAG'] == 'Y'].index.values
tri_registry_ids = echo_data[echo_data['TRI_FLAG'] == 'Y'].index.values
id_set = np.union1d( ghg_registry_ids, tri_registry_ids )
registry_ids = list(id_set)
program_data = program.get_data( ee_ids=registry_ids )
key = { i : i for i in registry_ids }
elif ( program.name == "Greenhouse Gases" or program.name == "Toxic Releases" ):
program_flag = program.echo_type + '_FLAG'
registry_ids = echo_data[echo_data[ program_flag ] == 'Y'].index.values
program_data = program.get_data( ee_ids=registry_ids )
key = { i : i for i in registry_ids }
else:
ids_string = program.echo_type + '_IDS'
ids = list()
registry_ids = list()
for index, value in echo_data[ ids_string ].items():
try:
for this_id in value.split():
ids.append( this_id )
key[this_id]=index
except ( KeyError, AttributeError ) as e:
pass
program_data = program.get_data( ee_ids=ids )
# Find the facility that matches the program data, by REGISTRY_ID.
# Add lat and lon, facility name and REGISTRY_ID as fac_registry_id.
# (Note: not adding REGISTRY_ID right now as it is sometimes interpreted as an int and that messes with the charts...)
my_prog_data = pd.DataFrame()
no_data_ids = []
# Look through all the facilities in my area and program and get supplemental echo_data info
if (program_data is None): # Handle no data
print("Sorry, we don't have data for this program! That could be an error on our part, or ECHO's, or because the data type doesn't apply to this area.")
else:
for fac in program_data.itertuples():
fac_id = fac.Index
reg_id = key[fac_id] # Look up this facility's Registry ID through its Program ID
try:
echo_row = pd.DataFrame(echo_data.loc[reg_id].copy()).T.reset_index() # Find and filter to the corresponding row in ECHO_EXPORTER
echo_row = echo_row[['FAC_NAME', 'FAC_LAT', 'FAC_LONG']] # Keep only the columns we need
program_row = pd.DataFrame([list(fac)[1:]], columns=program_data.columns.values) # Turn the program_data tuple into a DataFrame
full_row = pd.concat([program_row, echo_row], axis=1) # Join the EE row df and the program row df
frames = [my_prog_data, full_row]
my_prog_data = pd.concat( frames, ignore_index=False)
except KeyError:
# The facility wasn't found in the program data.
no_data_ids.append( fac.Index )
# in ordert to map, roll up my_prog_data to facility level
fac = my_prog_data.drop_duplicates(subset=['PGM_SYS_ID']) # or whatever the key is
map_of_facilities = mapper_marker(fac)
map_of_facilities
# read in and map geojson for the selected geography
geo = "county" #select_region_widget.value.lower()
geo_json_data = geopandas.read_file("ECHO-Geo/ma_"+geo+".geojson")
m = folium.Map(
#tiles='Mapbox Bright',
)
folium.GeoJson(
geo_json_data,
).add_to(m)
bounds = m.get_bounds()
m.fit_bounds(bounds)
m
# first, spatialize my_prog_data
gdf = geopandas.GeoDataFrame(
my_prog_data, geometry=geopandas.points_from_xy(my_prog_data["FAC_LONG"], my_prog_data["FAC_LAT"]))
join = geopandas.sjoin(gdf, geo_json_data, how="inner", op='intersects')
# get geo and attribute data column names
geo_column = {"county": "COUNTY"} # EXPAND
att_column = {"Greenhouse Gases": "ANNUAL_EMISSION"} # EXPAND
g = geo_column[geo]
a = att_column[program.name]
test = join.groupby(join[g])[[a]].agg("sum")
test.sort_values(by=a, ascending = False)
test.reset_index(inplace=True)
att_data = test.rename(columns={g: "geo", a: "value"})
mp = mapper_area(geo_json_data, att_data, g)
mp
ranked = my_prog_data.groupby(["PGM_SYS_ID", "FAC_NAME", "FAC_LAT", "FAC_LONG"])[[a]].agg("sum")
ranked.reset_index(inplace=True)
ranked = ranked.set_index("PGM_SYS_ID")
ranked.sort_values(by=a, ascending=False)
ranked['quantile'] = pd.qcut(ranked[a], 4, labels=False)
mp = mapper_circle(ranked, a)
mp